Add a new option `use_namespaces` to WebsiteAgent.

You need namespaces where they make sense, so make it possible to keep them instead of always stripping them.
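For illustration, here is roughly how the new option slots into a WebsiteAgent's options, using the namespace-prefixed XPath style exercised in the spec below; the feed URL and the `agent` variable are placeholders, not taken from this commit:

    # `agent` is assumed to be an existing Agents::WebsiteAgent instance.
    agent.options = {
      'type' => 'xml',
      'url'  => 'https://example.com/feed.atom',  # placeholder feed URL
      'mode' => 'on_change',
      'use_namespaces' => 'true',                 # the new toplevel option; parsed with boolify()
      'extract' => {
        # With namespaces kept, the Atom default namespace is addressed via the "xmlns" prefix.
        'title' => { 'xpath' => '/xmlns:feed/xmlns:entry', 'value' => 'normalize-space(./xmlns:title)' },
        'url'   => { 'xpath' => '/xmlns:feed/xmlns:entry', 'value' => './xmlns:link[1]/@href' }
      }
    }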

Akinori MUSHA committed 10 years ago · commit e791f75648

2 changed files with 47 additions and 4 deletions:

  1. app/models/agents/website_agent.rb (+8, -4)
  2. spec/models/agents/website_agent_spec.rb (+39, -0)

app/models/agents/website_agent.rb (+8, -4)

@@ -33,7 +33,7 @@ module Agents
 
       "@_attr_" is the XPath expression to extract the value of an attribute named _attr_ from a node, and ".//text()" is to extract all the enclosed texts.  You can also use [XPath functions](http://www.w3.org/TR/xpath/#section-String-Functions) like `normalize-space` to strip and squeeze whitespace, `substring-after` to extract part of a text, and `translate` to remove comma from a formatted number, etc.  Note that these functions take a string, not a node set, so what you may think would be written as `normalize-space(.//text())` should actually be `normalize-space(.)`.
 
-      Beware that when parsing an XML document (i.e. `type` is `xml`) using `xpath` expressions all namespaces are stripped from the document.
+      Beware that when parsing an XML document (i.e. `type` is `xml`) using `xpath` expressions, all namespaces are stripped from the document unless the top-level option `use_namespaces` is set to true.
 
       When parsing JSON, these sub-hashes specify [JSONPaths](http://goessner.net/articles/JsonPath/) to the values that you care about.  For example:
 
@@ -302,9 +302,13 @@ module Agents
     end
 
     def use_namespaces?
-      interpolated['extract'].none? { |name, extraction_details|
-        extraction_details.key?('xpath')
-      }
+      if interpolated.key?('use_namespaces')
+        boolify(interpolated['use_namespaces'])
+      else
+        interpolated['extract'].none? { |name, extraction_details|
+          extraction_details.key?('xpath')
+        }
+      end
     end
 
     def extract_each(&block)
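The agent parses XML with Nokogiri, and my reading of the surrounding code is that "stripping" corresponds to Nokogiri's `remove_namespaces!`; that part is an assumption, since this diff does not show it. A minimal standalone sketch of the difference the flag controls, using the Atom shape from the specs:

    require 'nokogiri'

    xml = <<~XML
      <feed xmlns="http://www.w3.org/2005/Atom">
        <entry><title>Shift to dev group</title></entry>
      </feed>
    XML

    # use_namespaces true: namespaces are kept, so the Atom default namespace
    # must be referenced explicitly; Nokogiri registers it under the "xmlns" prefix.
    kept = Nokogiri::XML(xml)
    kept.xpath('/xmlns:feed/xmlns:entry/xmlns:title').text  # => "Shift to dev group"
    kept.xpath('/feed/entry/title').size                     # => 0 (no match)

    # use_namespaces false: namespaces are stripped (assumed remove_namespaces!),
    # so plain, unprefixed element names match.
    stripped = Nokogiri::XML(xml)
    stripped.remove_namespaces!
    stripped.xpath('/feed/entry/title').text                 # => "Shift to dev group"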

spec/models/agents/website_agent_spec.rb (+39, -0)

@@ -401,6 +401,28 @@ describe Agents::WebsiteAgent do
           expect(event.payload['thumbnail']).to eq('https://avatars3.githubusercontent.com/u/365751?s=30')
         end
 
+        it "works with XPath with namespaces unstripped" do
+          @checker.options['use_namespaces'] = 'true'
+          @checker.save!
+          expect {
+            @checker.check
+          }.to change { Event.count }.by(0)
+
+          @checker.options['extract'] = {
+            'title' => { 'xpath' => '/xmlns:feed/xmlns:entry', 'value' => 'normalize-space(./xmlns:title)' },
+            'url' => { 'xpath' => '/xmlns:feed/xmlns:entry', 'value' => './xmlns:link[1]/@href' },
+            'thumbnail' => { 'xpath' => '/xmlns:feed/xmlns:entry', 'value' => './media:thumbnail/@url' },
+          }
+          @checker.save!
+          expect {
+            @checker.check
+          }.to change { Event.count }.by(20)
+          event = Event.last
+          expect(event.payload['title']).to eq('Shift to dev group')
+          expect(event.payload['url']).to eq('https://github.com/cantino/huginn/commit/d465158f77dcd9078697e6167b50abbfdfa8b1af')
+          expect(event.payload['thumbnail']).to eq('https://avatars3.githubusercontent.com/u/365751?s=30')
+        end
+
         it "works with CSS selectors" do
           @checker.options['extract'] = {
             'title' => { 'css' => 'feed > entry', 'value' => 'normalize-space(./title)' },
@@ -429,6 +451,23 @@ describe Agents::WebsiteAgent do
           expect(event.payload['url']).to eq('https://github.com/cantino/huginn/commit/d465158f77dcd9078697e6167b50abbfdfa8b1af')
           expect(event.payload['thumbnail']).to eq('https://avatars3.githubusercontent.com/u/365751?s=30')
         end
+
+        it "works with CSS selectors with namespaces stripped" do
+          @checker.options['extract'] = {
+            'title' => { 'css' => 'feed > entry', 'value' => 'normalize-space(./title)' },
+            'url' => { 'css' => 'feed > entry', 'value' => './link[1]/@href' },
+            'thumbnail' => { 'css' => 'feed > entry', 'value' => './thumbnail/@url' },
+          }
+          @checker.options['use_namespaces'] = 'false'
+          @checker.save!
+          expect {
+            @checker.check
+          }.to change { Event.count }.by(20)
+          event = Event.last
+          expect(event.payload['title']).to eq('Shift to dev group')
+          expect(event.payload['url']).to eq('https://github.com/cantino/huginn/commit/d465158f77dcd9078697e6167b50abbfdfa8b1af')
+          expect(event.payload['thumbnail']).to eq('https://avatars3.githubusercontent.com/u/365751?s=30')
+        end
       end
 
       describe "JSON" do
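Taken together, the specs pin down a simple rule: an explicit `use_namespaces` wins, and only when it is absent does the old heuristic apply (strip namespaces whenever any extractor uses `xpath`). A standalone restatement for illustration only; `namespaces_kept?` is a hypothetical helper, not part of the agent:

    # Mirrors use_namespaces? above: explicit option first, old heuristic as fallback.
    def namespaces_kept?(options)
      if options.key?('use_namespaces')
        options['use_namespaces'].to_s == 'true'  # stands in for Huginn's boolify()
      else
        options['extract'].none? { |_name, details| details.key?('xpath') }
      end
    end

    namespaces_kept?('use_namespaces' => 'false', 'extract' => {})               # => false
    namespaces_kept?('extract' => { 'title' => { 'css'   => 'feed > entry' } })  # => true
    namespaces_kept?('extract' => { 'title' => { 'xpath' => '//entry' } })       # => false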